# Fetch the credit card number to analyse — presumably from the
# R Markdown `params` list (NULL when knitted without one; TODO confirm).
cc_number <- function() {
  params$cc_number
}
cc_number <- cc_number()
cc_number
## NULL
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(skimr)
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

The Data

# Load the cleaned transaction data and fix up column types in one pass:
# characters and the two ID-like numeric columns become factors, and the
# timestamp becomes a proper date-time.
data <- read.csv('clean_data.csv')

data <- data %>%
  mutate(across(where(is.character), as.factor),
         cc_num = as.factor(cc_num),
         zip = as.factor(zip),
         trans_date_trans_time = as_datetime(trans_date_trans_time))

Bucketing User Behaviour

data_bucket = data %>% group_by(cc_num) %>% summarize('count' = n())

Silhouette for profiling

library(cluster)
# Average silhouette width for k = 2..20 clusters of per-card transaction
# counts (pam() is deterministic, so no seed is needed). vapply()
# guarantees a numeric vector; the 'silhoette' typo is fixed — the
# variable is only used inside this chunk.
silhouette_width <- vapply(2:20,
                           FUN = function(k) pam(x = data_bucket$count, k = k)$silinfo$avg.width,
                           FUN.VALUE = numeric(1))

ggplot(data = data.frame(cluster = 2:20, silhouette_width),
       aes(x = cluster, y = silhouette_width)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(2, 20, 1))

Use 7 buckets

# Cluster the cards into 7 behaviour profiles by transaction volume.
set.seed(617)
km_profile <- kmeans(x = data_bucket$count, centers = 7, iter.max = 10000, nstart = 25)
k_segments_profile <- km_profile$cluster
table(k_segments_profile)
## k_segments_profile
##   1   2   3   4   5   6   7 
##  31  57  96 130 247 194 238
# cbind (not mutate) is deliberate: it coerces the tibble to a plain
# data.frame, so the later [i, 1] indexing returns scalars.
data_bucket <- cbind(data_bucket, profile = k_segments_profile)
# One representative card per profile: the cc_num with the highest
# transaction count within that profile.
# BUG FIX: the original indexed row k for profile k ([2,1], [3,1], ...,
# [7,1]) — an apparent copy-paste slip that picked an arbitrary card
# instead of the top one. Every profile now takes its first (largest-
# count) row, [1, 1].
cc_representative <- c(
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 1))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 2))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 3))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 4))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 5))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 6))[1, 1],
                      (data_bucket %>% arrange(desc(count)) %>% filter(profile == 7))[1, 1]
                      )
library(svDialogs)
## Warning: package 'svDialogs' was built under R version 4.1.3
# Interactive alternative to the hard-coded card below:
#cc_number <- dlgInput("Enter credit card number", Sys.info()["user"])$res

# Card under analysis; swap in one of the profile representatives
# below to study a different behaviour bucket.
cc_number <- 30270432095985
 # cc_number = as.numeric(as.character(cc_representative[1]))  
 # cc_number = as.numeric(as.character(cc_representative[2]))
 # cc_number = as.numeric(as.character(cc_representative[3]))
 # cc_number = as.numeric(as.character(cc_representative[4]))
 # cc_number = as.numeric(as.character(cc_representative[5]))
 # cc_number = as.numeric(as.character(cc_representative[6]))
 # cc_number = as.numeric(as.character(cc_representative[7]))

# All transactions belonging to the selected card.
data_indiv <- data %>% filter(cc_num == cc_number)

Total within sum of square plot

# Elbow plot: total within-cluster sum of squares for k = 1..10 on this
# card's transaction amounts. vapply() (vs. sapply()) guarantees a
# numeric vector even on degenerate input; the seed is reset per k so
# each fit is reproducible.
within_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)
  kmeans(x = data_indiv$amt, centers = k, iter.max = 1000, nstart = 25)$tot.withinss},
  FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 219600)
ggplot(data = data.frame(cluster = 1:10, within_ss), aes(x = cluster, y = within_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))

## Ratio plot — between-cluster SS as a share of total SS for k = 1..10.

# vapply() replaces sapply() for a type-stable numeric result.
ratio_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)   # reseed per k for reproducibility
  km <- kmeans(x = data_indiv$amt, centers = k, iter.max = 1000, nstart = 25)
  km$betweenss / km$totss},
  FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 219600)
ggplot(data = data.frame(cluster = 1:10, ratio_ss), aes(x = cluster, y = ratio_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))

## Silhouette

library(cluster)
# Average silhouette width for k = 2..10 on this card's transaction
# amounts. vapply() guarantees a numeric vector; the 'silhoette' typo is
# fixed (the variable is only used inside this chunk).
silhouette_width <- vapply(2:10,
                           FUN = function(k) pam(x = data_indiv$amt, k = k)$silinfo$avg.width,
                           FUN.VALUE = numeric(1))

ggplot(data = data.frame(cluster = 2:10, silhouette_width),
       aes(x = cluster, y = silhouette_width)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(2, 10, 1))

Silhouette suggests 4, 6 or 9

The other metrics suggest 4

Since we want to detect niche, we’ll use 9 clusters

Hierarchical Clustering

# Ward hierarchical clustering on the card's transaction amounts,
# cut into 9 segments. 'dist_amt' avoids reusing the one-letter
# name 'd' that later code recycles.
dist_amt <- dist(x = data_indiv$amt, method = 'euclidean')
clusters <- hclust(d = dist_amt, method = 'ward.D2')
h_segments <- cutree(tree = clusters, k = 9)
table(h_segments)
## h_segments
##    1    2    3    4    5    6    7    8    9 
## 1833 1805  587   47   90    2    6   21    1

K-means Clustering

# K-means with 9 centers on the same amounts, for comparison with the
# hierarchical segmentation above.
set.seed(617)
km <- kmeans(x = data_indiv$amt, centers = 9, iter.max = 10000, nstart = 25)

k_segments <- km$cluster
table(k_segments)
## k_segments
##    1    2    3    4    5    6    7    8    9 
##  687 2149   18    3 1348    1  131   15   40

Model based clustering

library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
# Gaussian mixture model on the card's transaction amounts; Mclust
# chooses the number of components itself (by BIC).
m_clusters = Mclust(data=data_indiv$amt)
m_segments = m_clusters$classification
#sort(table(m_segments))

# Density plot left disabled; the uncertainty and classification plots
# are limited to amounts below $500 for readability (larger amounts are
# clipped, hence the rug() warning).
#plot(m_clusters, what = "density", xlim = c(0, 300))
plot(m_clusters, what = "uncertainty", xlim = c(0, 500))
## Warning in rug(data, lwd = 1, col = adjustcolor(par("fg"), alpha.f = 0.8)): some
## values will be clipped

plot(m_clusters, what = 'classification', xlim = c(0, 500))

Kmodes

# K-modes clustering on categorical features of this card's transactions.
cat = data_indiv %>% select_if(is.factor) %>%  names()
cat_data = data_indiv[cat]
# NOTE(review): columns are selected by position (3 and 13) — presumably
# merchant/category fields; verify against the factor-column order of
# clean_data.csv before changing upstream columns.
cat_data = cat_data[,c(3,13)]
library(klaR)
## Warning: package 'klaR' was built under R version 4.1.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
hasil = kmodes(cat_data, 7, iter.max = 7, weighted = FALSE, fast = TRUE)
kmode_segments = hasil$cluster
# Combine the four segmentations with the original transaction columns.
data_clusters = cbind(data_indiv, h_segments, k_segments, m_segments, kmode_segments)
# NOTE(review): positional selection again (cols 3:5, 17, 21:25) —
# assumes a fixed column layout; confirm before reordering columns.
data_clusters = data_clusters[,c(3,4,5,17,21:25)]

Evaluating different clusters on indiv_user

Hierarchical clustering

# Flag the 4 smallest hierarchical segments as predicted fraud: rare
# spending patterns are treated as anomalous.
# names(sort(table(...)))[1:4] replaces the original's four repeated
# attributes(sort(table(...))[i])$name extractions — same values,
# no copy-paste risk.
data_clusters$h_is_fraud_pred <- 0

h_rare <- as.numeric(names(sort(table(h_segments)))[1:4])

data_clusters$h_is_fraud_pred[data_clusters$h_segments %in% h_rare] <- 1

K clustering

# Flag the 4 smallest k-means segments as predicted fraud (same
# rare-segment heuristic as the hierarchical version above).
data_clusters$k_is_fraud_pred <- 0

k_rare <- as.numeric(names(sort(table(k_segments)))[1:4])

data_clusters$k_is_fraud_pred[data_clusters$k_segments %in% k_rare] <- 1

m clustering

# Flag the 2 smallest model-based (Mclust) segments as predicted fraud.
data_clusters$m_is_fraud_pred <- 0

m_rare <- as.numeric(names(sort(table(m_segments)))[1:2])

data_clusters$m_is_fraud_pred[data_clusters$m_segments %in% m_rare] <- 1

kmode

# Flag the 2 smallest k-modes segments as predicted fraud.
data_clusters$kmode_is_fraud_pred <- 0

kmode_rare <- as.numeric(names(sort(table(kmode_segments)))[1:2])

data_clusters$kmode_is_fraud_pred[data_clusters$kmode_segments %in% kmode_rare] <- 1
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
# Ensemble: flag a transaction when either the model-based or the
# k-modes clustering flags it. Kept as a logical vector (TRUE/FALSE),
# exactly as the original — it is converted to 0/1 downstream.
data_clusters$ensemble_pred <- (data_clusters$m_is_fraud_pred | data_clusters$kmode_is_fraud_pred)

# Actual label (col 4) plus the five prediction columns (cols 10:14).
# NOTE(review): positional indexing assumes the cbind/selection order
# established earlier in the script; confirm before changing columns.
result_matrix = data_clusters[,c(4, 10:14)]

# confusionMatrix() requires factors on both sides.
expected_value <- factor(result_matrix$is_fraud)
h_predicted_value <- factor(result_matrix$h_is_fraud_pred)
k_predicted_value <- factor(result_matrix$k_is_fraud_pred)
m_predicted_value <- factor(result_matrix$m_is_fraud_pred)
kmode_predicted_value <- factor(result_matrix$kmode_is_fraud_pred)
ensemble_predicted_value <- factor(result_matrix$ensemble_pred)
 
#Creating confusion matrix
# Hierarchical-clustering predictions vs. actual fraud labels.
# Note: caret picks 0 (non-fraud) as the 'Positive' class, so
# Specificity — not Sensitivity — is the fraud-detection rate here.
h_cm <- confusionMatrix(data=h_predicted_value, reference = expected_value)
h_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4350   12
##          1   27    3
##                                           
##                Accuracy : 0.9911          
##                  95% CI : (0.9879, 0.9937)
##     No Information Rate : 0.9966          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.1294          
##                                           
##  Mcnemar's Test P-Value : 0.02497         
##                                           
##             Sensitivity : 0.9938          
##             Specificity : 0.2000          
##          Pos Pred Value : 0.9972          
##          Neg Pred Value : 0.1000          
##              Prevalence : 0.9966          
##          Detection Rate : 0.9904          
##    Detection Prevalence : 0.9932          
##       Balanced Accuracy : 0.5969          
##                                           
##        'Positive' Class : 0               
## 
# K-means predictions vs. actual fraud labels.
k_cm <- confusionMatrix(data=k_predicted_value, reference = expected_value)
k_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4343   12
##          1   34    3
##                                           
##                Accuracy : 0.9895          
##                  95% CI : (0.9861, 0.9923)
##     No Information Rate : 0.9966          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.1111          
##                                           
##  Mcnemar's Test P-Value : 0.00196         
##                                           
##             Sensitivity : 0.99223         
##             Specificity : 0.20000         
##          Pos Pred Value : 0.99724         
##          Neg Pred Value : 0.08108         
##              Prevalence : 0.99658         
##          Detection Rate : 0.98884         
##    Detection Prevalence : 0.99158         
##       Balanced Accuracy : 0.59612         
##                                           
##        'Positive' Class : 0               
## 
# Model-based (Mclust) predictions vs. actual fraud labels — best
# Specificity (fraud catch rate) of the single methods, at the cost
# of more false alarms.
m_cm <- confusionMatrix(data=m_predicted_value, reference = expected_value)
m_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4240    9
##          1  137    6
##                                          
##                Accuracy : 0.9668         
##                  95% CI : (0.961, 0.9719)
##     No Information Rate : 0.9966         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0702         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.96870        
##             Specificity : 0.40000        
##          Pos Pred Value : 0.99788        
##          Neg Pred Value : 0.04196        
##              Prevalence : 0.99658        
##          Detection Rate : 0.96539        
##    Detection Prevalence : 0.96744        
##       Balanced Accuracy : 0.68435        
##                                          
##        'Positive' Class : 0              
## 
# K-modes predictions vs. actual fraud labels — catches no fraud at
# all here (Specificity 0), worse than chance by Kappa.
kmode_cm <- confusionMatrix(data=kmode_predicted_value, reference = expected_value)
kmode_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4116   15
##          1  261    0
##                                           
##                Accuracy : 0.9372          
##                  95% CI : (0.9296, 0.9442)
##     No Information Rate : 0.9966          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0065         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9404          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9964          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9966          
##          Detection Rate : 0.9372          
##    Detection Prevalence : 0.9406          
##       Balanced Accuracy : 0.4702          
##                                           
##        'Positive' Class : 0               
## 
# Ensemble predictions vs. actual fraud labels. The original rebuilt a
# 0/1 factor via as.factor(as.numeric(factor)-1) on the TRUE/FALSE
# factor; building it directly from the logical column with
# as.integer() is equivalent (TRUE -> 1, FALSE -> 0) and clearer.
ensemble_cm <- confusionMatrix(data = factor(as.integer(result_matrix$ensemble_pred)),
                               reference = expected_value)
ensemble_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4005    9
##          1  372    6
##                                           
##                Accuracy : 0.9133          
##                  95% CI : (0.9045, 0.9214)
##     No Information Rate : 0.9966          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0241          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.91501         
##             Specificity : 0.40000         
##          Pos Pred Value : 0.99776         
##          Neg Pred Value : 0.01587         
##              Prevalence : 0.99658         
##          Detection Rate : 0.91189         
##    Detection Prevalence : 0.91393         
##       Balanced Accuracy : 0.65751         
##                                           
##        'Positive' Class : 0               
## 

#compare to logistic regression

library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# NOTE(review): features are picked by position (cols 2:5); confirm
# they are the intended predictors plus is_fraud.
data_clusters2 <- data_clusters[, c(2:5)]
set.seed(5205)
# Stratified 70/30 split on the fraud label.
split <- sample.split(data_clusters2$is_fraud, SplitRatio = 0.7)
train <- data_clusters2[split, ]
test <- data_clusters2[!split, ]

model <- glm(is_fraud ~ ., data = train, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
pred <- predict(model, newdata = test, type = 'response')

expected_value2 <- factor(test$is_fraud)
# ROBUSTNESS FIX: pin levels = c(0, 1). Without it, a run where every
# prediction falls on one side of the 0.5 threshold yields a one-level
# factor and confusionMatrix() errors on mismatched levels.
glm_predicted_value <- factor(as.integer(pred > 0.5), levels = c(0, 1))

glm_cm <- confusionMatrix(data = glm_predicted_value, reference = expected_value2)
glm_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1312    5
##          1    1    0
##                                           
##                Accuracy : 0.9954          
##                  95% CI : (0.9901, 0.9983)
##     No Information Rate : 0.9962          
##     P-Value [Acc > NIR] : 0.7625          
##                                           
##                   Kappa : -0.0013         
##                                           
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.9992          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9962          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9962          
##          Detection Rate : 0.9954          
##    Detection Prevalence : 0.9992          
##       Balanced Accuracy : 0.4996          
##                                           
##        'Positive' Class : 0               
## 

Spatial Analysis

visualizing data

# All fraudulent transactions across the whole dataset.
data_map <- filter(data, is_fraud == 1)

library(ggmap)
## Warning: package 'ggmap' was built under R version 4.1.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# SECURITY FIX: the Google Maps API key was hard-coded in the source.
# Read it from the environment instead (set GOOGLE_MAPS_API_KEY before
# knitting) and revoke the previously committed key.
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Continental-US view centered on the geographic midpoint.
map <- get_map(location = c(-95.7129, 37.0902), zoom = 4, scale = 4)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.0902,-95.7129&zoom=4&size=640x640&scale=4&maptype=terrain&language=en-EN&key=xxx
ggmap(map) +
  geom_point(data = data_map, aes(x = merch_long, y = merch_lat), size = 0.5, alpha = 0.5, color = 'red')
## Warning: Removed 17 rows containing missing values (geom_point).

# Fraud counts by state, most affected first.
data_map2 <- data_map %>% group_by(state) %>% summarize('count' = n()) %>% arrange(desc(count))
data_map2
## # A tibble: 49 x 2
##    state count
##    <fct> <int>
##  1 NY      568
##  2 TX      555
##  3 PA      543
##  4 OH      348
##  5 IL      319
##  6 CA      289
##  7 AL      263
##  8 MO      262
##  9 MN      253
## 10 VA      240
## # ... with 39 more rows

analysis with the clusters

# Attach merchant coordinates to the clustered data and split out
# predicted vs. actual fraud for mapping.
merch_lat <- data_indiv$merch_lat
merch_long <- data_indiv$merch_long

data_spatial <- cbind(data_clusters, merch_lat, merch_long)
data_spatial_fraud_pred <- filter(data_spatial, m_is_fraud_pred == 1)
data_spatial_fraud <- filter(data_spatial, is_fraud == 1)

library(ggmap)
# SECURITY FIX: read the API key from the environment rather than
# committing it to source control (see the earlier ggmap chunk).
register_google(key = Sys.getenv("GOOGLE_MAPS_API_KEY"))
# Zoomed view around the card's median merchant location.
map <- get_map(location = c(median(data_spatial$merch_long), median(data_spatial$merch_lat)), zoom = 8, scale = 4)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.669803,-91.023106&zoom=8&size=640x640&scale=4&maptype=terrain&language=en-EN&key=xxx
# Predicted fraud (red) over all of the card's transactions (green).
ggmap(map) +
  geom_point(data = data_spatial, aes(x = merch_long, y = merch_lat), size = 1, alpha = 0.2, color = 'seagreen') +
  geom_point(data = data_spatial_fraud_pred, aes(x = merch_long, y = merch_lat), size = 1, alpha = 1, color = 'red')

# Actual fraud (red) over all of the card's transactions (green).
ggmap(map) +
  geom_point(data = data_spatial, aes(x = merch_long, y = merch_lat), size = 1, alpha = 0.2, color = 'seagreen') +
  geom_point(data = data_spatial_fraud, aes(x = merch_long, y = merch_lat), size = 1, alpha = 1, color = 'red')

# Identifying proportions of fraudulent merchant categories

#from predicted
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Renamed from 'cat' to avoid shadowing base::cat().
pred_category_counts <- table(data_spatial_fraud_pred$category)
cat2 <- data.table(round(prop.table(pred_category_counts) * 100, 2))

# Sum of amounts per category; tapply() and table() both order by
# factor levels, so the rows line up. Categories with no flagged
# transactions get NA sums (visible in the output below).
tap <- tapply(data_spatial_fraud_pred$amt, data_spatial_fraud_pred$category, sum)
tap <- as.data.frame.table(tap)
#data.table(tap)

cat2$sum <- tap$Freq
colnames(cat2) <- c('category', 'proportions', 'Sum of Purchases ($)')
cat2 <- cat2 %>% arrange(desc(proportions))
colnames(cat2) <- c('Merchant', 'Proportions (%)', 'Sum of Purchases ($)')
cat2
##           Merchant Proportions (%) Sum of Purchases ($)
##  1:       misc_net           18.18             13463.53
##  2:       misc_pos           17.48             10992.80
##  3:   shopping_pos           17.48             25191.93
##  4:   shopping_net           10.49             10125.14
##  5:           home            9.09              2810.56
##  6:  personal_care            6.29              1870.20
##  7:  entertainment            4.90              1436.96
##  8:    food_dining            4.90              2117.86
##  9:      kids_pets            4.90              1746.85
## 10: health_fitness            3.50              1136.30
## 11:    grocery_pos            1.40               701.42
## 12:         travel            1.40              1593.92
## 13:  gas_transport            0.00                   NA
## 14:    grocery_net            0.00                   NA
#from actual
library(data.table)
# Same breakdown for the card's actual fraud; 'cat' renamed to avoid
# shadowing base::cat().
actual_category_counts <- table(data_spatial_fraud$category)
cat2 <- data.table(round(prop.table(actual_category_counts) * 100, 2))

tap <- tapply(data_spatial_fraud$amt, data_spatial_fraud$category, sum)
tap <- as.data.frame.table(tap)
#data.table(tap)

# Rows align because table() and tapply() share factor-level order.
cat2$sum <- tap$Freq
colnames(cat2) <- c('category', 'proportions', 'Sum of Purchases ($)')
cat2 <- cat2 %>% arrange(desc(proportions))
colnames(cat2) <- c('Merchant', 'Proportions (%)', 'Sum of Purchases ($)')
cat2
##           Merchant Proportions (%) Sum of Purchases ($)
##  1:    food_dining           20.00               352.20
##  2:  personal_care           20.00                58.26
##  3:    grocery_pos           13.33               701.42
##  4:      kids_pets           13.33                40.16
##  5:   shopping_pos           13.33              2135.94
##  6:  gas_transport            6.67                12.54
##  7:           home            6.67               258.37
##  8:   shopping_net            6.67              1004.38
##  9:  entertainment            0.00                   NA
## 10:    grocery_net            0.00                   NA
## 11: health_fitness            0.00                   NA
## 12:       misc_net            0.00                   NA
## 13:       misc_pos            0.00                   NA
## 14:         travel            0.00                   NA